{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2e25cf66",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "041e9423",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "506bec39",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022/06/01 12:27:06 INFO mlflow.tracking.fluent: Experiment with name 'green-taxi-duration' does not exist. Creating a new experiment.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Experiment: artifact_location='s3://mlflow-models-alexey/1', experiment_id='1', lifecycle_stage='active', name='green-taxi-duration', tags={}>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import mlflow\n",
    "\n",
    "mlflow.set_tracking_uri(\"http://127.0.0.1:5000\")\n",
    "mlflow.set_experiment(\"green-taxi-duration\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b9666e19",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_dataframe(filename: str):\n",
    "    df = pd.read_parquet(filename)\n",
    "\n",
    "    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime\n",
    "    df.duration = df.duration.dt.total_seconds() / 60\n",
    "    df = df[(df.duration >= 1) & (df.duration <= 60)]\n",
    "\n",
    "    categorical = ['PULocationID', 'DOLocationID']\n",
    "    df[categorical] = df[categorical].astype(str)\n",
    "    return df\n",
    "\n",
    "\n",
    "def prepare_dictionaries(df: pd.DataFrame):\n",
    "    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']\n",
    "    categorical = ['PU_DO']\n",
    "    numerical = ['trip_distance']\n",
    "    dicts = df[categorical + numerical].to_dict(orient='records')\n",
    "    return dicts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6b5f0d80",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = read_dataframe('data/green_tripdata_2021-01.parquet')\n",
    "df_val = read_dataframe('data/green_tripdata_2021-02.parquet')\n",
    "\n",
    "target = 'duration'\n",
    "y_train = df_train[target].values\n",
    "y_val = df_val[target].values\n",
    "\n",
    "dict_train = prepare_dictionaries(df_train)\n",
    "dict_val = prepare_dictionaries(df_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c73081e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 15.136777093556063\n"
     ]
    }
   ],
   "source": [
    "with mlflow.start_run():\n",
    "    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)\n",
    "    mlflow.log_params(params)\n",
    "\n",
    "    pipeline = make_pipeline(\n",
    "        DictVectorizer(),\n",
    "        RandomForestRegressor(**params, n_jobs=-1)\n",
    "    )\n",
    "\n",
    "    pipeline.fit(dict_train, y_train)\n",
    "    y_pred = pipeline.predict(dict_val)\n",
    "\n",
    "    rmse = mean_squared_error(y_pred, y_val, squared=False)\n",
    "    print(params, rmse)\n",
    "    mlflow.log_metric('rmse', rmse)\n",
    "\n",
    "    mlflow.sklearn.log_model(pipeline, artifact_path=\"model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c22bbccb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from mlflow.tracking import MlflowClient\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "8ff6b57e",
   "metadata": {},
   "outputs": [],
   "source": [
    "MLFLOW_TRACKING_URI = 'http://127.0.0.1:5000'\n",
    "RUN_ID = 'b4d3bca8aa8e46a6b8257fe4541b1136'\n",
    "\n",
    "client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "524f7008",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = client.download_artifacts(run_id=RUN_ID, path='dict_vectorizer.bin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d760f7ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path, 'rb') as f_out:\n",
    "    dv = pickle.load(f_out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c35deb48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DictVectorizer()"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "529caf33",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
